******************
**# A. Trade Data
******************
*SIC87 import, export data from Peter Schott
use "raw/xm_sic87_72_105_20120424.dta", clear
collapse (sum) customs x, by(sic year)
save "processing/schott_imports_exports_aggregate", replace

	
*****************
**# B. NBER Data 
*****************
use "raw/nberces5818v1_s1987.dta", clear

***************
**## B.1 Wages
***************
* Production workers
*	prodw: payroll in $1m
*	prode: workers in 1000s
gen ln_prod_wage = ln(prodw/prode)
label var ln_prod_wage "Log Production Wage"

* Non-production workers
*	pay: payroll in $1m
*	emp: employment in 1000s
gen ln_non_prod_wage = ln((pay-prodw)/(emp-prode))
label var ln_non_prod_wage "Log Non-Production Wage"

* Wage-Skill Premium
gen ln_wsp = ln_non_prod_wage-ln_prod_wage
label var ln_wsp "Log Wage-Skill Premium"


*************
**## B.2 Emp
*************
* Production workers
gen unskill_emp = prode
gen ln_prod_emp = ln(prode)
label var ln_prod_emp "Log Production Employment"

* Non-production workers
gen skill_emp = emp-prode
gen ln_non_prod_emp = ln(emp-prode)
label var ln_non_prod_emp "Log Non-Production Employment"

* Log Employment Difference
gen ln_skill_emp_diff = ln_non_prod_emp-ln_prod_emp
label var ln_skill_emp_diff "Log Employment Difference"


************************
**## B.3 Labor Payments
************************
* Production workers
gen unskill_pay = prodw
gen ln_prod_pay = ln(prodw)
label var ln_prod_pay "Log Labor Payments, Production"

* Non-production workers
gen skill_pay = pay-prodw
gen ln_non_prod_pay = ln(pay-prodw)
label var ln_non_prod_pay "Log Labor Payments, Non-Production"

*Log Payment Difference
gen ln_skill_pay_diff = ln_non_prod_pay-ln_prod_pay
label var ln_skill_pay_diff "Log Pay Difference"


**********************
**## B.4 Skill Shares
**********************
*Skill Pay Share
gen skill_pay_share = (pay-prodw)/pay
label var skill_pay_share "Non-Production Pay Share"

*Skill Employment Share
gen skill_emp_share = (emp-prode)/emp
label var skill_emp_share "Non-Production Employment Share"


****************
**## B.5 Output
****************
*Shipments (Not Deflated)
gen ln_shipment = ln(vship)
label var ln_shipment "Log Ship"

*Investment  (Not Deflated)
gen ln_invest = ln(invest)
label var ln_invest "Log Invest"


****************************
**## B.6 Payments/Shipments
****************************
gen pay_ship = pay/vship
label var pay_ship "pay/ship"

gen ln_pay_ship = ln(pay_ship)
label var ln_pay_ship "ln(pay/ship)"


**************************
**## B.7 Controls (70/78)
**************************
* Price indices
foreach v in piship pimat piinv pien{
sum `v' if year==1970
loc base = `r(mean)'
replace `v' = 100*`v'/`base'

}

* Investment
gen invest_def = invest/piinv
gen ln_invest_def = ln(invest_def)
gen ln_invest_def_1970 = ln_invest_def*(year==1970)
replace ln_invest_def_1970 = . if year!=1970
bys sic: egen ln_invest_def_70 = max(ln_invest_def_1970)

gen ln_invest_def_1978 = ln_invest_def*(year==1978)
replace ln_invest_def_1978 = . if year!=1978
bys sic: egen ln_invest_def_78 = max(ln_invest_def_1978)

gen ln_invest_70 = ln_invest*(year==1970)
replace ln_invest_70 = . if year != 1970
egen m_ln_invest_70 = min(ln_invest_70), by(sic)
replace ln_invest_70 = m_ln_invest_70
drop m_ln_invest_70

gen ln_invest_78 = ln_invest*(year==1978)
replace ln_invest_78 = . if year != 1978
egen m_ln_invest_78 = min(ln_invest_78), by(sic)
replace ln_invest_78 = m_ln_invest_78
drop m_ln_invest_78

* Capital/Labor
gen cap_lab = (cap/emp)*(year==1978)
egen ln_cap_lab_78 = max(cap_lab), by(sic)
replace ln_cap_lab_78 = ln(ln_cap_lab_78)

replace cap_lab = (cap/emp)*(year==1970)
egen ln_cap_lab_70 = max(cap_lab), by(sic)
replace ln_cap_lab_70 = ln(ln_cap_lab_70)

* Emp
gen ln_emp = ln(emp)*(year==1978)	
egen ln_emp_78 = max(ln_emp), by(sic)

replace ln_emp = ln(emp)*(year==1970)
egen ln_emp_70 = max(ln_emp), by(sic)

replace ln_emp = ln(emp)

* Skill intensity
gen skill_emp_share_min = skill_emp_share*(year==1978)
egen skill_emp_share_78 = max(skill_emp_share_min), by(sic)

replace skill_emp_share_min = skill_emp_share*(year==1970)
egen skill_emp_share_70 = max(skill_emp_share_min), by(sic)

* Materials/Shipments
gen mat_ship = matcost/vship
gen mat_ship_def = (matcost/pimat)/(vship/piship)
gen mat_ship_1970 = mat_ship if year==1970
gen mat_ship_def_1970  = mat_ship_def if year==1970

gen mat_ship_def_1978 = mat_ship_def if year==1978

bys sic: egen mat_ship_70 = max(mat_ship_1970)
bys sic: egen mat_ship_def_70 = max(mat_ship_def_1970)
bys sic: egen mat_ship_def_78 = max(mat_ship_def_1978)

drop mat_ship_1970 
gen ln_mat_ship_70 = ln(mat_ship_70)
gen ln_mat_ship = ln(mat_ship)

**## B.8 Clean up
drop *1970
keep sic year ln_* skill_*_share vship *_78 *_70  ///
ln_shipment skill_emp skill_pay unskill_emp unskill_pay pay emp ln_pay_ship pay_ship mat_ship*

save "processing/nber_wsp", replace


************************
**# C. Upstream tariffs
************************

************************
**## C.1 SIC I-O Matrix
************************
* Total requirements
import delimited "raw/1977 Total Req Coeff 537-level Data.txt", delimiter(space) clear 
rename v1 commodity_input
rename v2 commodity_output
rename v4 tot_req_coeff

* Keep manufacturing block of IO table
keep if commodity_input >= 130100 &  commodity_input <= 641200
keep if commodity_output >= 130100 &  commodity_output <= 641200

replace tot_req_coeff = tot_req_coeff-1 if commodity_input== commodity_output
* From BEA: Commodity-by-commodity total requirements coefficients.  
* These entries represent the commodity (row) output required per dollar of each 
* commodity (column) delivered to final demand.*/
drop v3 v5 v6

*Mapping commodity inputs to SIC87
	* Split commodity input requirement per $ of commodity output into constiutent SIC input codes
	* This will give SIC input requirement per $ of commodity output
rename commodity_input commodity
joinby commodity using "processing/commodity_sic87_conc", unmatched(both)
	* m=1: 726 input-by-output obs (0.38%) in the IO table, representing two inputs (220103 610603), not in commodity->SIC concordance
	* m=2: no obs
	* Have now concorded commodity_input to sic87
keep if _m == 3
replace tot_req_coeff = tot_req_coeff*comm_sic87_share
collapse (sum) tot_req_coeff, by(sic87 commodity_output)
rename sic87 sic87_input

*Mapping commodity output to SIC87
	* Have SIC input requirement per $ of commodity output
	* Need SIC input requirement per $ of SIC output
	* So for given SIC input code and given SIC output code: want to take weighted average across of input requirement per $ of output 
	* across constituent commodity output codes where weight corresponds to importance of commodity output code within SIC output code
rename commodity_output commodity
joinby commodity using "processing/sic87_commodity_conc", unmatched(both)
	* m=1: 908 input-by-output obs (0.39%) in the IO table, representing two inputs 220103 610603), not in commodity->SIC concordance
	* m=2: no obs
keep if _m == 3
drop _merge

replace tot_req_coeff = tot_req_coeff*sic87_comm_share
collapse (sum) tot_req_coeff, by(sic87 sic87_input)
rename sic87 sic87_output

egen tot_input = total(tot_req_coeff), by(sic87_output)
gen input_share = tot_req_coeff/tot_input
replace input_share=0 if tot_req_coeff==0 & tot_input==0
assert !missing(input_share)

label var tot_input     "Input usage of all inputs (total req, not direct req) per $ of output"
label var input_share   "tot_req_coeff/tot_input"
label var tot_req_coeff "Input usage (total req, not direct req) per $ of output"

save "processing/sic_io_matrix", replace


**************************
**## C.2 Upstream Tariffs
**************************
* Imports from Schott
use "raw/xm_sic87_72_105_20120424.dta", clear
collapse (sum) customs, by(sic year)
rename customs imports
* Keep MFG Only
keep if sic >= 2000 & sic <4000
keep if year == 1978
save "processing/imports", replace

use "processing/sic_io_matrix", clear
assert !missing(input_share)
rename sic87_input sic

merge m:1 sic using "processing/imports"
tab _m
	* m=1: 6287 obs (3.08%) input-output obs from manuf block of IO table, represting 14 sic inputs, where no imports of input
	* m=2: 4 obs (0.00%) where imports of sic input but sic input not in sic IO table (2261 2262 2517 3716)
keep if _m == 3


*Note: the approach below is the following.
* 1. Take share of output ind o's inputs accounted for by input i, call it alpha_io = a_io/sum_i(a_io)
* 2. Scale it by US imported inputs m_i to get alpha_io*m_i
* 3. To aggregate tariffs across inputs i for output industry o, generate weights w_io = 
*	 alpha_io*m_i/sum_i(alpha_io*m_i). The interpretation is that this procedure adjusts the importance of input i 
*	 for industry o through scaling by imports of input i.
* 	 Mathematically, this is equivalent to various alternatives:
* A. Instead of scaling by m_i in step 2 which is a continuous variable >=0, scale by m_i/mean_i(m_i) which is 
*	 imports of i relative to mean imports so that an input with imports equal to average US imports has a scale 
*	 factor of 1 and hence the scale factor scales the IO coefficient up or down based on input
*    i imports relative to mean imports. The equivalence is because this divides all terms in numerator and 
*	 denominator by the common factor mean_i(m_i)
* B. Instead of defining the IO coefficient in step 1 as o's inputs of i as a share of o's total inputs, define it as 
*	 o's input usage of i as share of economy's input usage of i: alpha_io = a_io/sum_i(alpha_io). This is equivalent 
*	 because just multiplying each term in numerator and denominator of w_io by sum_i(a_io) and dividing each term in 
*	 numerator and denominator by sum_o(a_io). This has the intuitive interpetation that take input i imports and 
*	 allocate to output industries according to their share of economy wide usage of input i.
*    This gives a level of imported input i for output industry o, m_io. The weight output industry o places on input 
*	 i's tariff is then simply the share of output ind o's imported inputs accounted for by input i: w_io = 
*	 m_io/sum_i(m_io)

gen imports_weight = imports*input_share
	* Input_share is share of input used by output industry. So, will allocate imports across output industries as inputs using input_share as the weight

egen import_input_share_tot = total(imports_weight), by(sic87_output year)
	* Total imports used as input by output industry
gen import_input_share = imports_weight/import_input_share_tot
	* Particular imported input as share of output industry total imported inputs 

keep sic sic87_output import_input_share year input_share imports
drop year

joinby sic using "raw/ave_iv_ols", unmatched(both)
	* Get input tariffs
	* m=1: Imported inputs we don't have tariff IV (0.97% of obs at input-output-year level, 46 unqiue sic codes)
	* These codes are never "reached" by sitc->hs92->sic87 concordance. About half of the time it's because the sic
	* codes are not in the hs92-sic concordance. The other half of the time they're in the concordance but not for 
	* year=1992. In this case, it's often because they start appearing in the concordance in 1993 or 1999 
	* m=2: input tariff obs for which we don't have input imports. This is 61 unique sic codes but only 3 unique manuf codes 2517 3761 3953.
	*	   2517 is not in our SIC IO table so we don't have IO-table implied input imports for it. 3761 and 3953 are not imported in 1978 in
	*	   Schott SIC import data
unique sic if _m==2 & inrange(sic,2000,4000)
	
keep if _m == 3
drop _merge

egen scale = total(import_input_share), by(sic87_output year)
	* Rescale output industry's imported input weights so they sum to 1 after only keeping _m==3
replace import_input_share = import_input_share/scale

gen ave_iv_upstream 	 = import_input_share*ave_iv_swiss
gen ave_ols_upstream 	 = import_input_share*ave_ols
gen ave_iv_upstream_col1 = import_input_share*ave_swiss

collapse (sum) ave_iv_upstream ave_ols_upstream ave_iv_upstream_col1, by(sic87_output year)

rename sic87_output sic

rename (ave_iv_upstream ave_ols_upstream ave_iv_upstream_col1) (ave_iv_upstream_impwt ave_ols_upstream_impwt ave_iv_upstream_col1_impwt)

save "processing/upstream_tariffs_import_weighted", replace


******************
**# D. Merge Data
******************

* Tariffs					
use "raw/ave_iv_ols.dta", clear

* NBER data
mmerge sic year using "processing/nber_wsp", type(1:1) uif(inrange(year,1970,1988))
assert _m!=1
	* m=2: sic-years in nber data but not our IV tariff data. All of the problematic sic codes are never "reached" 
	*      but out sitc->hs92->sic concordance.
	*	   Mostly, the unique sic codes are the obs identified above due to concordance issue
	*      But, about 14 additional sic codes not due to that concordance issue. Instead, these sic codes are simply 
	*      no in the hs92->sic concordance at all.
	*	   This is depite fact they're in the full list of SIC87 codes on NBER website. Nothing we can do about these.
keep if _m == 3
drop _merge

* Trade data
mmerge sic year using "processing/schott_imports_exports_aggregate.dta", type(1:1) uif(year<=1988 & inrange(sic,2000,4000))
	* m=1: have IV tariff/nber data but not in Schott import data. 4 unique sic codes 
	* m=2: have Schott import data but not in IV tariff/nber data. 49 unique SIC codes, essentially as identified 
	*      above stemming from sitc->sic concordance
drop if _merge == 2
drop _merge

gen ln_imp = ln(customs)
gen ln_exp = ln(x)

gen imp_pen     =    customs/(vship+customs-x)
gen ln_imp_pen  = ln(customs/(vship+customs-x))
gen ln_exp_ship = ln(x/vship)
gen exp_ship    =    x/vship
gen ln_imp_ship = ln(customs/vship)

* Upstream tariffs
merge 1:1 sic year using  "processing/upstream_tariffs_import_weighted"
* m=1: have IV tariff/nber data but not in IO tariff data. 1 unique sic codes 
* m=2: have IO tariff data but not in IV tariff/nber data. 53 unique SIC codes, essentially as identified above stemming from sitc->sic concordance
drop if _m == 2
drop _merge

* Clean up
label var ave_ols "\$ AVE_{it}  \$"
label var ave_ols_upstream_impwt "\$  AVE_{it}^{Up}  \$"
gen sic2 = floor(sic/100)
egen sic2_year = group(sic2 year)

save "processing/SIC87_Final_Dataset.dta", replace
